R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.

library(dplyr)
library(readr)

Load dataset and get variables of interest

# Load data
path <- "/mnt/Data/deidentified_data/full_cohort/preprocessed_data_long.csv"
data <- read_csv(file=path, col_names=TRUE, col_select=-1, na=c("", "NA"))
## New names:
## Rows: 22096 Columns: 95
## ── Column specification
## ──────────────────────────────────────────────────────────── Delimiter: "," chr
## (4): sex, race, smoking_status, substance dbl (60): RandID, mort_age, visit_number,
## time_diff, age, height, weight, dlco,... lgl (28): has_copd, has_ct, has_hosp,
## pr_complete_ever, home_oxygen_ever, hosp_... date (3): diagnosis_date,
## pr_start_date, date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify
## the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Get only required columns
ids <- c("RandID", "visit_number",
         "has_copd", "has_hosp",
         "date", "time_diff",
         "substance")
preds_cts = c(
  "age", # Months
  "height", # m
  "weight", # kg
  "bmi", # kg/m^2
  "fev1", # L
  "fvc", # L
  "fivc", # L
  "fev1_fvc", # %
  "fev1_fev6", # %
  "pef", # L/s
  "mmef", # L/s
  "dlco",
  "spo2"
)
preds_bin = c("hosp_past_year", "sex") # Binary predictors
preds_categ = c("smoking_status", "mmrc") # Categorical
outcomes_bin <- c(
  "hosp_1_year", "hosp_3_year", "hosp_5_year",
  "mort_1_year", "mort_3_year", "mort_5_year"
)
outcomes_surv <- c(
  "mort_next_time", # Time in months
  "hosp_next_time" # Time in months
)
preds <- c( preds_bin, preds_cts, preds_categ)
outcomes <- c(outcomes_bin, outcomes_surv)
data <- data %>% select(all_of(c(ids, preds, outcomes)))
head(data)
## # A tibble: 6 × 32
##   RandID visit_number has_copd has_hosp date       time_diff substance 
##    <dbl>        <dbl> <lgl>    <lgl>    <date>         <dbl> <chr>     
## 1      1            0 TRUE     TRUE     2009-06-01       0   ventolin  
## 2      1            1 TRUE     TRUE     2013-08-01      50.0 ventolin  
## 3      1            2 TRUE     TRUE     2019-12-01     126.  salbutamol
## 4      4            0 TRUE     TRUE     2009-12-01       0   ventolin  
## 5      6            0 TRUE     TRUE     2019-08-01       0   salbutamol
## 6      7            0 TRUE     FALSE    2019-03-01       0   salbutamol
## # ℹ 25 more variables: hosp_past_year <lgl>, sex <chr>, age <dbl>, height <dbl>,
## #   weight <dbl>, bmi <dbl>, fev1 <dbl>, fvc <dbl>, fivc <dbl>, fev1_fvc <dbl>,
## #   fev1_fev6 <dbl>, pef <dbl>, mmef <dbl>, dlco <dbl>, spo2 <dbl>,
## #   smoking_status <chr>, mmrc <dbl>, hosp_1_year <lgl>, hosp_3_year <lgl>,
## #   hosp_5_year <lgl>, mort_1_year <lgl>, mort_3_year <lgl>, mort_5_year <lgl>,
## #   mort_next_time <dbl>, hosp_next_time <dbl>

Look at missingness in each group

library(naniar)
vis_miss(data %>% select(all_of(preds)))

gg_miss_var(data %>% select(preds))

sprintf("Pct missing cases (all preds): %f",
        pct_miss_case(data %>% select(all_of(preds))))
## [1] "Pct missing cases (all preds): 89.930304"
sprintf("Pct missing cases (exclude dlco, fivc): %f",
        pct_miss_case(data %>% 
                        select(all_of(preds), -dlco, -fivc)))
## [1] "Pct missing cases (exclude dlco, fivc): 41.066256"
gg_miss_upset(data %>% select(all_of(preds)))

gg_miss_fct(x=data%>%select(all_of(preds)), fct=smoking_status)

gg_miss_fct(x=data%>%select(all_of(preds)), fct=mmrc)
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_tile()`).

mmrc and spo2 both having highly correlated missing values suggests that they are often measured as a pair (though the presence of mMRC values having non-na spo2 implies other things).

gg_miss_fct(x=data%>%select(all_of(preds)), fct=sex)

gg_miss_fct(x=data%>%select(all_of(preds)), fct=hosp_past_year)

hosp_past_year highlights some potentially interesting effects where hospitalisations in prior years increase the rate of having mmrc and dlco. It is possible these were measured in previous hospitalisations and they have been carried forward.

Also interested in change in missingness vs date

library(ggplot2)
data <- data %>%
  mutate(date = as.numeric(difftime(date, as.Date("2006-01-01"), units="days")))

ggplot(data, aes(x=date, y=spo2)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=mmef)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=fivc)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=dlco)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=mmrc)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=smoking_status)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=fev1_fev6)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=height)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=weight)) + geom_miss_point(alpha=0.01)

ggplot(data, aes(x=date, y=bmi)) + geom_miss_point(alpha=0.1)

data %>% select(smoking_status)
## # A tibble: 22,096 × 1
##    smoking_status
##    <chr>         
##  1 current       
##  2 current       
##  3 current       
##  4 <NA>          
##  5 ex            
##  6 ex            
##  7 ex            
##  8 ex            
##  9 ex            
## 10 ex            
## # ℹ 22,086 more rows
ggplot(data, aes(x=log(101-spo2))) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7499 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(data, aes(x=spo2^2)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7499 rows containing non-finite outside the scale range
## (`stat_bin()`).

# histogram of dates
ggplot(data, aes(x=date)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution is vaguely uniform.